{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 3 - Time Series Data Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Python for time series data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Common data preparation operations for time series"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Timestamps vs. Time Spans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import datetime as dt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Timestamp(dt.datetime(2014, 5, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Timestamp(\"2014-06-01\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Timestamp(2014, 6, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Period(\"2014-06\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Period(\"2014-06\", freq=\"D\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dates = [\n",
    "    pd.Timestamp(\"2014-06-01\"),\n",
    "    pd.Timestamp(\"2014-06-02\"),\n",
    "    pd.Timestamp(\"2014-06-03\"),\n",
    "]\n",
    "\n",
    "ts_data = pd.Series(np.random.randn(3), dates)\n",
    "\n",
    "type(ts_data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "periods = [pd.Period(\"2014-01\"), pd.Period(\"2014-02\"), pd.Period(\"2014-03\")]\n",
    "ts_data = pd.Series(np.random.randn(3), periods)\n",
    "type(ts_data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting to timestamps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime(pd.Series([\"Jul 31, 2012\", \"2012-01-10\", None]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime([\"2012/11/23\", \"2012.12.31\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime([\"04-01-2014 10:00\"], dayfirst=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime([\"14-01-2014\", \"01-14-2012\"], dayfirst=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Providing a Format Argument"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime(\"2018/11/12\", format=\"%Y/%m/%d\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime(\"11-11-2018 00:00\", format=\"%d-%m-%Y %H:%M\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "from common.utils import download_file, extract_data, load_data\n",
    "\n",
    "pd.options.display.float_format = \"{:,.2f}\".format\n",
    "np.set_printoptions(precision=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = \"./data\"\n",
    "\n",
    "if not os.path.exists(data_dir):\n",
    "    os.mkdir(data_dir)\n",
    "\n",
    "if not os.path.exists(os.path.join(data_dir, \"energy.csv\")):\n",
    "    download_file(\"https://mlftsfwp.blob.core.windows.net/mlftsfwp/GEFCom2014.zip\")\n",
    "    shutil.move(\"GEFCom2014.zip\", os.path.join(data_dir, \"GEFCom2014.zip\"))\n",
    "    extract_data(data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load = load_data(data_dir)[[\"load\"]]\n",
    "ts_data_load.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load[:5].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load[::2].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load[\"2012-6-01\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load[\"2012-1\":\"2012-2-28\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load[\"2012-1\":\"2012-1-2 00:00:00\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load.truncate(before=\"2013-11-01\", after=\"2013-11-02\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Frequency conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data = load_data(data_dir)\n",
    "ts_data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_ts_data = ts_data.asfreq(pd.offsets.BDay())\n",
    "daily_ts_data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_ts_data.asfreq(pd.offsets.BDay(), method=\"pad\")\n",
    "daily_ts_data.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Time series exploration and understanding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How to get started with time series data analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "import matplotlib\n",
    "import matplotlib.dates as mdates\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load = ts_data[\"load\"]\n",
    "decomposition = sm.tsa.seasonal_decompose(\n",
    "    ts_data_load[\"2012-07-01\":\"2012-12-31\"], model=\"additive\"\n",
    ")\n",
    "\n",
    "fig = decomposition.plot()\n",
    "matplotlib.rcParams[\"figure.figsize\"] = [10.0, 6.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decomposition = sm.tsa.seasonal_decompose(ts_data_load, model=\"additive\")\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.grid(True)\n",
    "\n",
    "year = mdates.YearLocator(month=1)\n",
    "month = mdates.MonthLocator(interval=1)\n",
    "\n",
    "year_format = mdates.DateFormatter(\"%Y\")\n",
    "month_format = mdates.DateFormatter(\"%m\")\n",
    "\n",
    "ax.xaxis.set_minor_locator(month)\n",
    "ax.xaxis.grid(True, which=\"minor\")\n",
    "ax.xaxis.set_major_locator(year)\n",
    "ax.xaxis.set_major_formatter(year_format)\n",
    "\n",
    "plt.plot(ts_data_load.index, ts_data_load, c=\"blue\")\n",
    "plt.plot(decomposition.trend.index, decomposition.trend, c=\"yellow\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Cleaning of Missing Values in the Time Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load.interpolate(limit=8, method=\"linear\", limit_direction=\"both\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp_mode = np.asscalar(stats.mode(ts_data[\"temp\"]).mode)\n",
    "ts_data[\"temp\"] = ts_data[\"temp\"].fillna(temp_mode)\n",
    "ts_data.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Time Series Data Normalization and Standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Series\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "values = ts_data_load.values\n",
    "values = values.reshape((len(values), 1))\n",
    "\n",
    "scaler = MinMaxScaler(feature_range=(0, 1))\n",
    "scaler = scaler.fit(values)\n",
    "print(\"Min: %f, Max: %f\" % (scaler.data_min_, scaler.data_max_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normalized = scaler.transform(values)\n",
    "for i in range(5):\n",
    "    print(normalized[i])\n",
    "\n",
    "inversed = scaler.inverse_transform(normalized)\n",
    "for i in range(5):\n",
    "    print(inversed[i])\n",
    "\n",
    "from math import sqrt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "values = ts_data_load.values\n",
    "values = values.reshape((len(values), 1))\n",
    "\n",
    "scaler = StandardScaler()\n",
    "scaler = scaler.fit(values)\n",
    "print(\"Mean: %f, StandardDeviation: %f\" % (scaler.mean_, sqrt(scaler.var_)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "normalized = scaler.transform(values)\n",
    "for i in range(5):\n",
    "    print(normalized[i])\n",
    "\n",
    "inversed = scaler.inverse_transform(normalized)\n",
    "for i in range(5):\n",
    "    print(inversed[i])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time series feature engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Date Time Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data[\"hour\"] = [ts_data.index[i].hour for i in range(len(ts_data))]\n",
    "ts_data[\"month\"] = [ts_data.index[i].month for i in range(len(ts_data))]\n",
    "ts_data[\"dayofweek\"] = [ts_data.index[i].day for i in range(len(ts_data))]\n",
    "print(ts_data.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lagged Features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  Shift function with DateOffset class and offset alias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data = load_data(data_dir)\n",
    "ts_data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_shift = ts_data.shift(4, freq=pd.offsets.BDay())\n",
    "ts_data_shift.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "ts_data_shift_2 = ts_data.tshift(6, freq=\"D\")\n",
    "ts_data_shift_2.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "lines_to_next_cell": 2
   },
   "source": [
    "#### Shift() function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generated_lagged_features(ts_data, var, max_lag):\n",
    "    for t in range(1, max_lag + 1):\n",
    "        ts_data[var + \"_lag\" + str(t)] = ts_data[var].shift(t, freq=\"1H\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "generated_lagged_features(ts_data, \"load\", 8)\n",
    "generated_lagged_features(ts_data, \"temp\", 8)\n",
    "print(ts_data.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rolling Window Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import concat\n",
    "\n",
    "load_val = ts_data[[\"load\"]]\n",
    "shifted = load_val.shift(1)\n",
    "\n",
    "window = shifted.rolling(window=6)\n",
    "means = window.mean()\n",
    "new_dataframe = concat([means, load_val], axis=1)\n",
    "new_dataframe.columns = [\"load_rol_mean\", \"load\"]\n",
    "\n",
    "print(new_dataframe.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import concat\n",
    "\n",
    "load_val = ts_data[[\"load\"]]\n",
    "width = 4\n",
    "shifted = load_val.shift(width - 1)\n",
    "window = shifted.rolling(window=width)\n",
    "\n",
    "new_dataframe = pd.concat([window.min(), window.mean(), window.max(), load_val], axis=1)\n",
    "new_dataframe.columns = [\"min\", \"mean\", \"max\", \"load\"]\n",
    "\n",
    "print(new_dataframe.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Expanding Window Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import concat\n",
    "\n",
    "load_val = ts_data[[\"load\"]]\n",
    "window = load_val.expanding()\n",
    "new_dataframe = concat(\n",
    "    [window.min(), window.mean(), window.max(), load_val.shift(-1)], axis=1\n",
    ")\n",
    "new_dataframe.columns = [\"min\", \"mean\", \"max\", \"load+1\"]\n",
    "print(new_dataframe.head(10))"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "ipynb,py:light"
  },
  "kernel_info": {
   "name": "python3-azureml"
  },
  "kernelspec": {
   "display_name": "Python 3.6 - AzureML",
   "language": "python",
   "name": "python3-azureml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "nteract": {
   "version": "nteract-front-end@1.0.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
